cores=20
ref="/data1/jiawei_li/yizhi_chen/ref/GRCm39"
projPath="/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta"
mkdir -p ${projPath}/alignment/sam/bowtie2_summary
mkdir -p ${projPath}/alignment/bam
mkdir -p ${projPath}/alignment/bed
mkdir -p ${projPath}/alignment/bedgraph
cat filenames
Foxg1
Flag
IgG
cat filenames| while read histName
do
bowtie2 --local --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 1000 -p ${cores} -x ${ref} -1 ${projPath}/${histName}_R1.fq.gz -2 ${projPath}/${histName}_R2.fq.gz -S ${projPath}/alignment/sam/${histName}_bowtie2.sam &> ${projPath}/alignment/sam/bowtie2_summary/${histName}_bowtie2.txt
done
mkdir -p $projPath/alignment/sam/fragmentLen
## Extract the 9th column from the alignment sam file which is the fragment length
cat filenames| while read histName
do
samtools view -F 0x04 $projPath/alignment/sam/${histName}_bowtie2.sam | awk -F'\t' 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($9)}' | sort | uniq -c | awk -v OFS="\t" '{print $2, $1/2}' >$projPath/alignment/sam/fragmentLen/${histName}_fragmentLen.txt
done
## R command
library(ggplot2)
library(viridis)
library(GenomicRanges)
library(stringr)
library(ggpubr)
library(magrittr)
library(dplyr)
setwd("E:/硕士/bioinformatics/Foxg1 CUT&TAG/重测序/比对")
projPath = "E:/硕士/bioinformatics/Foxg1 CUT&TAG/重测序/比对"
histList = c("Foxg1", "Flag", "IgG")
sampleList = histList
## Collect the alignment results from the bowtie2 alignment summary files
alignResult = c()
for(hist in sampleList){
alignRes = read.table(paste0(projPath, "/", hist, "_bowtie2.txt"), header = FALSE, fill = TRUE)
alignRate = substr(alignRes$V1[6], 1, nchar(as.character(alignRes$V1[6]))-1)
histInfo = strsplit(hist, "_")[[1]]
alignResult = data.frame(Histone = histInfo[1], Replicate = histInfo[2],
SequencingDepth = alignRes$V1[1] %>% as.character %>% as.numeric,
MappedFragNum_GRCm39 = alignRes$V1[4] %>% as.character %>% as.numeric + alignRes$V1[5] %>% as.character %>% as.numeric,
AlignmentRate_GRCm39 = alignRate %>% as.numeric) %>% rbind(alignResult, .)
}
alignResult$Histone = factor(alignResult$Histone, levels = histList)
alignResult %>% mutate(AlignmentRate_GRCm39 = paste0(AlignmentRate_GRCm39, "%"))
## Generate sequencing depth boxplot
pdf('alignment results.pdf',height = 10,width = 10)
fig1 = alignResult %>% ggplot(aes(x = Histone, y = SequencingDepth/1000000, fill = Histone)) +
geom_boxplot() +
geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 18) +
ylab("Sequencing Depth per Million") +
xlab("") +
ggtitle("A. Sequencing Depth")
fig2 = alignResult %>% ggplot(aes(x = Histone, y = MappedFragNum_GRCm39/1000000, fill = Histone)) +
geom_boxplot() +
geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 18) +
ylab("Mapped Fragments per Million") +
xlab("") +
ggtitle("B. Alignable Fragment (GRCm39)")
fig3 = alignResult %>% ggplot(aes(x = Histone, y = AlignmentRate_GRCm39, fill = Histone)) +
geom_boxplot() +
geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 18) +
ylab("% of Mapped Fragments") +
xlab("") +
ggtitle("C. Alignment Rate (GRCm39)")
ggarrange(fig1, fig2, fig3, ncol = 2, nrow=2, common.legend = TRUE, legend="bottom")
dev.off()
## Collect the fragment size information
fragLen = c()
for(hist in sampleList){
histInfo = strsplit(hist, "_")[[1]]
fragLen = read.table(paste0(projPath, "/", hist, "_fragmentLen.txt"), header = FALSE) %>% mutate(fragLen = V1 %>% as.numeric, fragCount = V2 %>% as.numeric, Weight = as.numeric(V2)/sum(as.numeric(V2)), Histone = histInfo[1], Replicate = histInfo[2], sampleInfo = hist) %>% rbind(fragLen, .) }
fragLen$sampleInfo = factor(fragLen$sampleInfo, levels = sampleList)
fragLen$Histone = factor(fragLen$Histone, levels = histList)
## Generate the fragment size density plot (violin plot)
pdf('fragment size distribution.pdf',height = 6,width = 12)
fig5 = fragLen %>% ggplot(aes(x = sampleInfo, y = fragLen, weight = Weight, fill = Histone)) +
geom_violin(bw = 5) +
scale_y_continuous(breaks = seq(0, 800, 50)) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 20) +
ggpubr::rotate_x_text(angle = 20) +
ylab("Fragment Length") +
xlab("")
fig6 = fragLen %>% ggplot(aes(x = fragLen, y = fragCount, color = Histone, group = sampleInfo, linetype = sampleInfo)) +
geom_line(linewidth = 1) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma") +
theme_bw(base_size = 20) +
xlab("Fragment Length") +
ylab("Count") +
coord_cartesian(xlim = c(0, 500))
ggarrange(fig5, fig6, ncol = 2)
dev.off()
##== linux command ==##
## depending on how you load picard and your server environment, the picardCMD can be different. Adjust accordingly.
picardCMD=" java -jar /data1/jiawei_li/yizhi_chen/foxg1_cut_tag/picard/build/libs/picard.jar"
mkdir -p $projPath/alignment/removeDuplicate/picard_summary
## Sort by coordinate
cat filenames| while read histName
do
$picardCMD SortSam I=$projPath/alignment/sam/${histName}_bowtie2.sam O=$projPath/alignment/sam/${histName}_bowtie2.sorted.sam SORT_ORDER=coordinate
done
## mark duplicates
cat filenames| while read histName
do
$picardCMD MarkDuplicates I=$projPath/alignment/sam/${histName}_bowtie2.sorted.sam O=$projPath/alignment/removeDuplicate/${histName}_bowtie2.sorted.dupMarked.sam METRICS_FILE=$projPath/alignment/removeDuplicate/picard_summary/${histName}_picard.dupMark.txt
done
## remove duplicates
cat filenames| while read histName
do
$picardCMD MarkDuplicates I=$projPath/alignment/sam/${histName}_bowtie2.sorted.sam O=$projPath/alignment/removeDuplicate/${histName}_bowtie2.sorted.rmDup.sam REMOVE_DUPLICATES=true METRICS_FILE=$projPath/alignment/removeDuplicate/picard_summary/${histName}_picard.rmDup.txt
done
##=== R command ===##
library(ggplot2)
library(viridis)
library(GenomicRanges)
library(stringr)
library(ggpubr)
library(magrittr)
library(dplyr)
setwd("E:/硕士/bioinformatics/Foxg1 CUT&TAG/重测序/picard")
projPath = "E:/硕士/bioinformatics/Foxg1 CUT&TAG/重测序"
histList = c("Foxg1", "Flag", "IgG")
sampleList = histList
alignResult = c()
for(hist in sampleList){
alignRes = read.table(paste0(projPath, "/比对/", hist, "_bowtie2.txt"), header = FALSE, fill = TRUE)
alignRate = substr(alignRes$V1[6], 1, nchar(as.character(alignRes$V1[6]))-1)
histInfo = strsplit(hist, "_")[[1]]
alignResult = data.frame(Histone = histInfo[1], Replicate = histInfo[2],
SequencingDepth = alignRes$V1[1] %>% as.character %>% as.numeric,
MappedFragNum_GRCm39 = alignRes$V1[4] %>% as.character %>% as.numeric + alignRes$V1[5] %>% as.character %>% as.numeric,
AlignmentRate_GRCm39 = alignRate %>% as.numeric) %>% rbind(alignResult, .)
}
alignResult$Histone = factor(alignResult$Histone, levels = histList)
alignResult %>% mutate(AlignmentRate_GRCm39 = paste0(AlignmentRate_GRCm39, "%"))
AlignmentRate_GRCm39 = alignResult %>% mutate(AlignmentRate_GRCm39 = paste0(AlignmentRate_GRCm39, "%"))
dupResult = c()
for(hist in sampleList){
dupRes = read.table(paste0(projPath, "/picard/", hist, "_picard.rmDup.txt"), header = TRUE, fill = TRUE)
histInfo = strsplit(hist, "_")[[1]]
dupResult = data.frame(Histone = histInfo[1], Replicate = histInfo[2],
MappedFragNum_GRCm39 = dupRes$READ_PAIRS_EXAMINED[1] %>% as.character %>% as.numeric,
DuplicationRate = dupRes$PERCENT_DUPLICATION[1] %>% as.character %>% as.numeric * 100,
EstimatedLibrarySize = dupRes$ESTIMATED_LIBRARY_SIZE[1] %>% as.character %>% as.numeric) %>% mutate(UniqueFragNum = MappedFragNum_GRCm39 * (1-DuplicationRate/100)) %>% rbind(dupResult, .)
}
dupResult$Histone = factor(dupResult$Histone, levels = histList)
alignDupSummary = left_join(alignResult, dupResult, by = c("Histone", "Replicate", "MappedFragNum_GRCm39")) %>% mutate(DuplicationRate = paste0(DuplicationRate, "%"))
alignDupSummary
## generate boxplot figure for the duplication rate
fig7 = dupResult %>% ggplot(aes(x = Histone, y = DuplicationRate, fill = Histone)) +
geom_boxplot() +
geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 18) +
ylab("Duplication Rate (*100%)") +
xlab("")
fig8 = dupResult %>% ggplot(aes(x = Histone, y = EstimatedLibrarySize, fill = Histone)) +
geom_boxplot() +
geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 18) +
ylab("Estimated Library Size") +
xlab("")
fig9 = dupResult %>% ggplot(aes(x = Histone, y = UniqueFragNum, fill = Histone)) +
geom_boxplot() +
geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 18) +
ylab("# of Unique Fragments") +
xlab("")
ggarrange(fig7, fig8, fig9, ncol = 3, common.legend = TRUE, legend="bottom")
##== linux command ==##
## Build up effective chromsizes file
samtools faidx /data1/jiawei_li/yizhi_chen/foxg1_cut_tag/fasta/Mus_musculus.GRCm39.dna.toplevel.fa
cut -f1,2 Mus_musculus.GRCm39.dna.toplevel.fa.fai > GRCm39.chrom.sizes
## Filter and keep the mapped read pairs
cat filenames| while read histName
do
samtools view -bS -F 0x04 $projPath/alignment/sam/${histName}_bowtie2.sam >$projPath/alignment/bam/${histName}_bowtie2.mapped.bam
done
## Convert into bed file format
cat filenames| while read histName
do
bedtools bamtobed -i $projPath/alignment/bam/${histName}_bowtie2.mapped.bam -bedpe >$projPath/alignment/bed/${histName}_bowtie2.bed
done
## Keep the read pairs that are on the same chromosome and fragment length less than 1000bp.
cat filenames| while read histName
do
awk '$1==$4 && $6-$2 < 1000 {print $0}' $projPath/alignment/bed/${histName}_bowtie2.bed >$projPath/alignment/bed/${histName}_bowtie2.clean.bed
done
## Only extract the fragment related columns
cat filenames| while read histName
do
cut -f 1,2,6 $projPath/alignment/bed/${histName}_bowtie2.clean.bed | sort -k1,1 -k2,2n -k3,3n >$projPath/alignment/bed/${histName}_bowtie2.fragments.bed
done
## Scale Factor
cat filenames| while read histName
do
seqDepthDouble=`samtools view -F 0x04 $projPath/alignment/sam/${histName}_bowtie2.sam|wc -l`
seqDepth=$((seqDepthDouble/2))
if [[ "$seqDepth" -gt "1" ]]; then
mkdir -p $projPath/alignment/bedgraph
scale_factor=`echo "1000000 / $seqDepth" | bc -l` #所以RPKM就是1000000
echo "Scaling factor for $histName is: $scale_factor!"
bedtools genomecov -bg -scale $scale_factor -i $projPath/alignment/bed/${histName}_bowtie2.fragments.bed -g $chromSize > $projPath/alignment/bedgraph/${histName}_bowtie2.fragments.normalized.bedgraph
fi
done
usage: An example usage is:$ bamCoverage -b reads.bam -o coverage.bw
*bamCoverage* offers normalization by scaling factor, Reads Per Kilobase per Million mapped reads (RPKM), counts per million (CPM), bins per
million mapped reads (BPM) and 1x depth (reads per genome coverage, RPGC).
Required arguments:
--bam BAM file, -b BAM file
BAM file to process (default: None)
Output:
--outFileName FILENAME, -o FILENAME
Output file name. (default: None)
--outFileFormat {bigwig,bedgraph}, -of {bigwig,bedgraph}
Output file type. Either "bigwig" or "bedgraph". (default: bigwig)
Optional arguments:
--help, -h show this help message and exit
--scaleFactor SCALEFACTOR
The computed scaling factor (or 1, if not applicable) will be multiplied by this. (Default: 1.0)
--MNase Determine nucleosome positions from MNase-seq data. Only 3 nucleotides at the center of each fragment are counted. The fragment ends are
defined by the two mate reads. Only fragment lengthsbetween 130 - 200 bp are considered to avoid dinucleosomes or other artifacts. By default,
any fragments smaller or larger than this are ignored. To over-ride this, use the --minFragmentLength and --maxFragmentLength options, which
will default to 130 and 200 if not otherwise specified in the presence of --MNase. *NOTE*: Requires paired-end data. A bin size of 1 is
recommended. (default: False)
--Offset INT [INT ...]
Uses this offset inside of each read as the signal. This is useful in cases like RiboSeq or GROseq, where the signal is 12, 15 or 0 bases past
the start of the read. This can be paired with the --filterRNAstrand option. Note that negative values indicate offsets from the end of each
read. A value of 1 indicates the first base of the alignment (taking alignment orientation into account). Likewise, a value of -1 is the last
base of the alignment. An offset of 0 is not permitted. If two values are specified, then they will be used to specify a range of positions.
Note that specifying something like --Offset 5 -1 will result in the 5th through last position being used, which is equivalent to trimming 4
bases from the 5-prime end of alignments. Note that if you specify --centerReads, the centering will be performed before the offset. (default:
None)
--filterRNAstrand {forward,reverse}
Selects RNA-seq reads (single-end or paired-end) originating from genes on the given strand. This option assumes a standard dUTP-based library
preparation (that is, --filterRNAstrand=forward keeps minus-strand reads, which originally came from genes on the forward strand using a dUTP-
based method). Consider using --samExcludeFlag instead for filtering by strand in other contexts. (default: None)
--version show program's version number and exit
--binSize INT bp, -bs INT bp
Size of the bins, in bases, for the output of the bigwig/bedgraph file. (Default: 50)
--region CHR:START:END, -r CHR:START:END
Region of the genome to limit the operation to - this is useful when testing parameters to reduce the computing time. The format is
chr:start:end, for example --region chr10 or --region chr10:456700:891000. (default: None)
--blackListFileName BED file [BED file ...], -bl BED file [BED file ...]
A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to
overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the
read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant. (default: None)
--numberOfProcessors INT, -p INT
Number of processors to use. Type "max/2" to use half the maximum number of processors or "max" to use all available processors. (Default: 1)
--verbose, -v Set to see processing messages. (default: False)
Read coverage normalization options:
--effectiveGenomeSize EFFECTIVEGENOMESIZE
The effective genome size is the portion of the genome that is mappable. Large fractions of the genome are stretches of NNNN that should be
discarded. Also, if repetitive regions were not included in the mapping of reads, the effective genome size needs to be adjusted accordingly. A
table of values is available here: http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html . (default: None)
--normalizeUsing {RPKM,CPM,BPM,RPGC,None}
Use one of the entered methods to normalize the number of reads per bin. By default,no normalization is performed.
RPKM = Reads Per Kilobase per Million mapped reads;
CPM = Counts Per Million mapped reads, same as CPM in RNA-seq;
BPM = Bins Per Million mapped reads, same as TPM in RNA-seq;
RPGC = reads per genomic content (1x normalization); Mapped reads are considered after blacklist filtering (if applied).
RPKM (per bin) = number of reads per bin / (number of mapped reads (in millions) * bin length (kb)).
CPM (per bin) = number of reads per bin / number of mapped reads (in millions).
BPM (per bin) = number of reads per bin / sum of all reads per bin (in millions). RPGC (per bin) = number of reads per bin / scaling factor for 1x average coverage. None = the default and equivalent to not setting this option at all. This scaling factor, in turn, is determined from the sequencing depth: (total number of mapped reads * fragment length) / effective genome size.
The scaling factor used is the inverse of the sequencing depth computed for the sample to match the 1x coverage. This option requires --effectiveGenomeSize. Each read is considered independently, if you want to only count one mate from a pair in paired-end data, then use the --samFlagInclude/--samFlagExclude options. (Default: None)
--exactScaling Instead of computing scaling factors based on a sampling of the reads, process all of the reads to determine the exact number that will be used
in the output. This requires significantly more time to compute, but will produce more accurate scaling factors in cases where alignments that
are being filtered are rare and lumped together. In other words, this is only needed when region-based sampling is expected to produce
incorrect results. (default: False)
--ignoreForNormalization IGNOREFORNORMALIZATION [IGNOREFORNORMALIZATION ...], -ignore IGNOREFORNORMALIZATION [IGNOREFORNORMALIZATION ...]
A list of space-delimited chromosome names containing those chromosomes that should be excluded for computing the normalization. This is useful when considering samples with unequal coverage across chromosomes, like male samples. An usage examples is --ignoreForNormalization chrX chrM.
(default: None)
--skipNonCoveredRegions, --skipNAs
This parameter determines if non-covered regions (regions without overlapping reads) in a BAM file should be skipped. The default is to treat
those regions as having a value of zero. The decision to skip non-covered regions depends on the interpretation of the data. Non-covered
regions may represent, for example, repetitive regions that should be skipped. (default: False)
--smoothLength INT bp
The smooth length defines a window, larger than the binSize, to average the number of reads. For example, if the --binSize is set to 20 and the
--smoothLength is set to 60, then, for each bin, the average of the bin and its left and right neighbors is considered. Any value smaller than
--binSize will be ignored and no smoothing will be applied. (default: None)
Read processing options:
--extendReads [INT bp], -e [INT bp]
This parameter allows the extension of reads to fragment size. If set, each read is extended, without exception. *NOTE*: This feature is
generally NOT recommended for spliced-read data, such as RNA-seq, as it would extend reads over skipped regions. *Single-end*: Requires a user
specified value for the final fragment length. Reads that already exceed this fragment length will not be extended. *Paired-end*: Reads with
mates are always extended to match the fragment size defined by the two read mates. Unmated reads, mate reads that map too far apart (>4x
fragment length) or even map to different chromosomes are treated like single-end reads. The input of a fragment length value is optional. If
no value is specified, it is estimated from the data (mean of the fragment size of all mate reads). (default: False)
--ignoreDuplicates If set, reads that have the same orientation and start position will be considered only once. If reads are paired, the mate's position also has
to coincide to ignore a read. (default: False)
--minMappingQuality INT
If set, only reads that have a mapping quality score of at least this are considered. (default: None)
--centerReads By adding this option, reads are centered with respect to the fragment length. For paired-end data, the read is centered at the fragment length
defined by the two ends of the fragment. For single-end data, the given fragment length is used. This option is useful to get a sharper signal
around enriched regions. (default: False)
--samFlagInclude INT Include reads based on the SAM flag. For example, to get only reads that are the first mate, use a flag of 64. This is useful to count properly
paired reads only once, as otherwise the second mate will be also considered for the coverage. (Default: None)
--samFlagExclude INT Exclude reads based on the SAM flag. For example, to get only reads that map to the forward strand, use --samFlagExclude 16, where 16 is the
SAM flag for reads that map to the reverse strand. (Default: None)
--minFragmentLength INT
The minimum fragment length needed for read/pair inclusion. This option is primarily useful in ATACseq experiments, for filtering mono- or di-
nucleosome fragments. (Default: 0)
--maxFragmentLength INT
The maximum fragment length needed for read/pair inclusion. (Default: 0)
##== linux command ==##
mkdir -p $projPath/alignment/bigwig
cat filenames| while read histName
do
samtools sort -o $projPath/alignment/bam/${histName}.sorted.bam $projPath/alignment/bam/${histName}_bowtie2.mapped.bam
done
cat filenames| while read histName
do
samtools index $projPath/alignment/bam/${histName}.sorted.bam
done
chromSize="/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/GRCm39.chrom.sizes"
# If you had run the command with --outFileFormat bedgraph, you could easily peak into the resulting file.
cat filenames| while read histName
do
bamCoverage -b ${projPath}/alignment/bam/${histName}.sorted.bam -o $projPath/alignment/bam/${histName}_normalized.bw \
--binSize 10 \
--normalizeUsing RPGC \
--effectiveGenomeSize 3388488650 \
--extendReads \
--outFileFormat bedgraph
done
# Bigwig
cat filenames| while read histName
do
bamCoverage -b ${projPath}/alignment/bam/${histName}.sorted.bam -o $projPath/alignment/bam/${histName}_normalized.bw \
--binSize 10 \
--normalizeUsing RPGC \
--effectiveGenomeSize 3388488650 \
--extendReads
done
##== linux command ==##
cat filenames| while read histName
do
samtools view -bS -F 0x04 $projPath/alignment/removeDuplicate/${histName}_bowtie2.sorted.rmDup.sam >$projPath/alignment/bam/${histName}_rmDup.bam
done
mkdir -p $projPath/alignment/bigwig
cat filenames| while read histName
do
samtools sort -o $projPath/alignment/bam/${histName}_rmDup.sorted.bam $projPath/alignment/bam/${histName}_rmDup.bam
samtools index $projPath/alignment/bam/${histName}_rmDup.sorted.bam
done
chromSize="/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/GRCm39.chrom.sizes"
# If you had run the command with --outFileFormat bedgraph, you could easily peak into the resulting file.
cat filenames| while read histName
do
bamCoverage -b ${projPath}/alignment/bam/${histName}_rmDup.sorted.bam -o $projPath/alignment/bam/${histName}_rmDup_normalized.bw \
--binSize 10 \
--normalizeUsing RPGC \
--effectiveGenomeSize 3388488650 \
--extendReads \
--outFileFormat bedgraph
#bigwig
bamCoverage -b ${projPath}/alignment/bam/${histName}_rmDup.sorted.bam -o $projPath/alignment/bam/${histName}_rmDup_normalized.bigwig \
--binSize 10 \
--normalizeUsing RPGC \
--effectiveGenomeSize 3388488650 \
--extendReads
done
cd $projPath/alignment/bedgraph
cat treatnames | while read treat
do
bash $projPath/SEACR_1.3.sh ${treat}_bowtie2.fragments.normalized.bedgraph IgG_bowtie2.fragments.normalized.bedgraph non stringent ${treat}_SEACR
done
cat treatnames | while read treat
do
bash $projPath/SEACR_1.3.sh ${treat}_rmDup_normalized.bw IgG_rmDup_normalized.bw norm stringent ${treat}_rmDup
done
cat treatnames | while read treat
do
bash $projPath/SEACR_1.3.sh ${treat}_rmDup_normalized.bw IgG_rmDup_normalized.bw non relax ${treat}_SEACR
done
cd $projPath/alignment/bam
cat treatnames | while read treat
do
bash $projPath/SEACR_1.3.sh ${treat}_normalized.bw IgG_normalized.bw non stringent ${treat}_SEACR
done
...
## --shift -100 --extsize 200 --nomodel -q 0.05
cat treatnames | while read treat
do
macs2 callpeak -t ${treat}.sorted.bam -f BAMPE -g mm -q 0.05 -B --SPMR -n ${treat} --outdir ./MACS2 2>${treat}.macs2.log
done
## --shift -100 --extsize 200 --nomodel -p 0.05
cat treatnames | while read treat
do
macs2 callpeak -t ${treat}.sorted.bam -f BAMPE -g mm -p 0.05 -B --SPMR -n ${treat} --outdir ./MACS2 2>${treat}.macs2.log
done
q0.05
p0.05
p0.01
## --shift -100 --extsize 200 --nomodel -c
cat treatnames | while read treat
do
macs2 callpeak -t ${treat}.sorted.bam -c IgG.sorted.bam -f BAMPE -g mm -p 0.05 -B --SPMR -n ${treat} --outdir ./MACS2 2>${treat}.macs2.log
done
q 0.05
p0.05
chromSize="/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/GRCm39.chrom.sizes"
cat filenames| while read histName
do
gopeaks -b ${histName}.sorted.bam -c IgG.sorted.bam -s $chromSize -o ./$histName
done
chromSize="/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/GRCm39.chrom.sizes"
cat filenames| while read histName
do
gopeaks -b ${histName}_rmDup.sorted.bam -c IgG_rmDup.sorted.bam -s $chromSize -o ./$histName
done
##== R command ==##
library("ChIPseeker")
library("org.Mm.eg.db")
library("GenomicFeatures")
#Mus_musculus.GRCm39.109.gtf在bioinformatic/GRCm39 ref文件夹中
txdb <- makeTxDbFromGFF("E:/硕士/bioinformatics/GRCm39 ref/Mus_musculus.GRCm39.109.gtf")
library("clusterProfiler")
library("ggplot2")
library("ggimage")
library("UpSetR")
library("ggpubr")
setwd=("E:/硕士/bioinformatics/Foxg1 CUT&TAG/重测序/peak")
Foxg1 <- readPeakFile("Foxg1_peaks.narrowPeak")
Flag <- readPeakFile("Flag_peaks.narrowPeak")
Foxg1
#GRanges object with 39471 ranges and 0 metadata columns:
#seqnames ranges strand
#<Rle> <IRanges> <Rle>
#[1] 1 3165101-3165400 *
#[2] 1 3443001-3443400 *
#[3] 1 3462101-3462450 *
# [4] 1 3465301-3466050 *
# [5] 1 3470351-3470750 *
# ... ... ... ...
#[39467] Y 90810551-90812100 *
#[39468] Y 90814801-90815450 *
#[39469] Y 90818601-90820150 *
#[39470] Y 90821901-90822350 *
#[39471] Y 90851801-90853350 *
#-------
#seqinfo: 23 sequences from an unspecified genome; no seqlengths
# 制作多个样本比较的list
peaks <- list(FOXG1=Foxg1,FLAG=Flag)
# promotor区间范围可以自己设定
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000)
tagMatrixList <- lapply(peaks, getTagMatrix, windows=promoter)
plotAvgProf(tagMatrixList, xlim=c(-3000, 3000))
plotAvgProf(tagMatrixList, xlim=c(-3000, 3000), conf=0.95,resample=500, facet="row")
plotPeakProf2(peaks, upstream = 3000, downstream = 3000, conf = 0.95,
by = "gene", type = "start_site", TxDb = txdb,# by = 'gene', 'transcript', 'exon' or features of interest(e.g. "enhancer")
facet = "row", nbin = 800)
plotPeakProf2(peaks, upstream = rel(0.2), downstream = rel(0.2),
conf = 0.95, by = "gene", type = "body",
TxDb = txdb, facet = "row", nbin = 800)
#tagHeatmap(tagMatrixList)
# annotatePeak传入annoDb参数,可进行基因ID转换(Entrez,ENSEMBL,SYMBOL,GENENAME)
peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb,tssRegion=c(-3000, 3000), verbose=FALSE)#annoDb="org.Mm.eg.db"
write.table(as.data.frame(peakAnnoList$FOXG1), file="FOXG1_peakAnno.xls", quote = F, row.names = F, sep = "\t")
write.table(as.data.frame(peakAnnoList$FLAG), file="FLAG_peakAnno.xls", quote = F, row.names = F, sep = "\t")
## 可视化
plotAnnoBar(peakAnnoList)
plotDistToTSS(peakAnnoList,title="Distribution of transcription factor-binding loci \n relative to TSS")
# 提取peakAnnolist中的基因,结合clusterProfiler包对peaks内的邻近基因进行富集注释。
# Create a list with genes from each sample
gene = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
vennplot(gene)
# Run GO enrichment analysis
# list用compareCluster函数做富集分析
ego <- compareCluster(geneCluster = gene,
fun = "enrichGO",
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "ALL", #One of "BP", "MF", and "CC" subontologies, or "ALL" for all three.
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE)
ego_BP <- compareCluster(geneCluster = gene,
fun = "enrichGO",
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "BP", #One of "BP", "MF", and "CC" subontologies, or "ALL" for all three.
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE)
ego_MF <- compareCluster(geneCluster = gene,
fun = "enrichGO",
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "MF", #One of "BP", "MF", and "CC" subontologies, or "ALL" for all three.
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE)
ego_CC <- compareCluster(geneCluster = gene,
fun = "enrichGO",
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "CC", #One of "BP", "MF", and "CC" subontologies, or "ALL" for all three.
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE)
# Dotplot visualization
BP=dotplot(ego_BP,
includeAll=FALSE,
showCategory=15,
font.size=10,
title = "GO:BP Pathway Enrichment Analysis")
MF=dotplot(ego_MF,
includeAll=FALSE,
showCategory=15,
font.size=10,
title = "GO:MF Pathway Enrichment Analysis")
CC=dotplot(ego_CC,
includeAll=FALSE,
showCategory=15,
font.size=10,
title = "GO:CC Pathway Enrichment Analysis")
pdf(file = "GO.pdf" , height = 10 ,width = 20 )
ggarrange(BP,MF,CC ,nrow = 1 ,align = "h")
dev.off()
# 也可以分开来做
ego1 <- enrichGO(gene = gene$FOXG1,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE)
ego2 <- enrichGO(gene = gene$FLAG,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE)
# Multiple samples KEGG analysis
##提取KEGG背景
library(msigdbr)
msigdbr_species()
mouse_KEGG = msigdbr(species = "Mus musculus",
category = "C2",
subcategory = "CP:KEGG") %>% dplyr::select(gs_name,ensembl_gene)
compKEGG <- compareCluster(geneCluster = gene,
fun = "enricher",
TERM2GENE = mouse_KEGG,
pvalueCutoff = 0.05,
pAdjustMethod = "BH")
KEGG=dotplot(compKEGG, showCategory = 20, title = "KEGG Pathway Enrichment Analysis")
pdf(file = "KEGG.pdf" , height = 10 ,width = 10 )
KEGG
dev.off()
pdf(file = "Total enrich.pdf",height = 22,width = 22)
ggarrange(BP,MF,CC,KEGG ,ncol = 2 ,nrow = 2 ,align = "hv")
dev.off()
pdf(file = "Total enrich2.pdf",height = 12,width = 25)
ggarrange(BP,MF,CC,KEGG ,nrow = 1 )
dev.off()
##covplot
covplot_Foxg1=covplot(Foxg1, title = "Foxg1 CUT&TAG Peaks over Chromosomes",
weightCol="V5",chrs=c("1", "2", "3", "4",
"5", "6", "7",
"8", "9", "10",
"11", "12", "13",
"14", "15", "16",
"17", "18", "19"))
covplot_Flag=covplot(Flag, title = "Flag CUT&TAG Peaks over Chromosomes",
weightCol="V5",chrs=c("1", "2", "3", "4",
"5", "6", "7",
"8", "9", "10",
"11", "12", "13",
"14", "15", "16",
"17", "18", "19"))
pdf(file = "covplot.pdf",height = 10,width = 20)
ggarrange(covplot_Foxg1,covplot_Flag,nrow = 1)
dev.off()
cores=20
##total
computeMatrix scale-regions -S $projPath/alignment/bam/Foxg1_normalized.bw \
$projPath/alignment/bam/Flag_normalized.bw \
-R $projPath/Mus_musculus.GRCm39.109.bed12 \
--beforeRegionStartLength 2000 \
--regionBodyLength 5000 \
--afterRegionStartLength 2000 \
-o $projPath/TOTAL_TSSTES_matrix_gene_at_genome.mat.gz \
-p $cores --skipZeros
plotHeatmap -m $projPath/TOTAL_TSSTES_matrix_gene_at_genome.mat.gz \
-out TOTAL_TSSTESHeatmap.pdf \
--colorMap OrRd \
--whatToShow 'plot, heatmap and colorbar' \
--heatmapHeight 17 \
--legendLocation best
##TSS
#--referencePoint TSS -b 5000 -a 5000
computeMatrix reference-point -p $cores \
--referencePoint TSS \
-b 5000 -a 5000 \
-R $projPath/Mus_musculus.GRCm39.109.bed12 \
-S $projPath/alignment/bigwig/Foxg1_raw.bw \
--skipZeros \
-out $projPath/Foxg1_TSS_matrix_gene_at_genome.mat.gz
plotHeatmap -m $projPath/Foxg1_TSS_matrix_gene_at_genome.mat.gz \
-out Foxg1TSS.peakHeatmap.pdf \
--outFileSortedRegions $projPath/alignment/clusterbed/FOXG1_TSS_cluster_regions.bed \
--colorMap OrRd \
--whatToShow 'plot, heatmap and colorbar' \
--kmeans 5 \
--heatmapHeight 15 \
--legendLocation best
##peak
###foxg1
computeMatrix reference-point -p $cores \
--referencePoint center \
-a 3000 -b 3000 \
-R $projPath/alignment/MACS2/Foxg1_peaks.narrowPeak \
-S $projPath/alignment/bam/Foxg1_normalized.bw \
--skipZeros \
-out $projPath/FOXG1peak_matrix_gene.mat.gz
plotHeatmap -m $projPath/FOXG1peak_matrix_gene.mat.gz \
-out FOXG1.peakHeatmap.pdf \
--outFileSortedRegions $projPath/alignment/clusterbed/FOXG1_cluster_regions.bed \
--colorMap OrRd \
--whatToShow 'plot, heatmap and colorbar' \
--kmeans 5 \
--heatmapHeight 15 \
--legendLocation best
###flag
computeMatrix reference-point -p $cores \
--referencePoint center \
-b 3000 -a 3000 \
-R $projPath/alignment/MACS2/Flag_peaks.narrowPeak \
-S $projPath/alignment/bam/Flag_normalized.bw \
--skipZeros \
-out $projPath/FLAGpeak_matrix_gene.mat.gz
plotHeatmap -m $projPath/FLAGpeak_matrix_gene.mat.gz \
-out FLAG.peakHeatmap.pdf \
--colorMap OrRd \
--whatToShow 'plot, heatmap and colorbar' \
--heatmapHeight 15 \
--legendLocation best
gimme motifs Foxg1_peaks.narrowPeak FOXG1.motif -g ~/.local/share/genomes/Mus_musculus.GRCm39.dna.toplevel -p HOMER --known
gimme motifs Flag_peaks.narrowPeak FLAG.motif -g ~/.local/share/genomes/Mus_musculus.GRCm39.dna.toplevel -p HOMER --known
gimme maelstrom -p HOMER for_motif_FOXG1_peakcenter ~/.local/share/genomes/Mus_musculus.GRCm39.dna.toplevel ./cluster_motif_res_Homer
gimme maelstrom -p JASPAR2020_vertebrates for_motif_FOXG1_peakcenter ~/.local/share/genomes/Mus_musculus.GRCm39.dna.toplevel ./motif_res_JASPAR_FOXG1_peakcenter
from gimmemotifs.maelstrom import MaelstromResult
import matplotlib.pyplot as plt
mr = MaelstromResult("/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/alignment/clusterbed/motif_res")
mr.plot_heatmap(threshold=1,figsize=(6,8))
plt.savefig("/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/alignment/clusterbed/motif_res_Homer_FOXG1_peakcenter/FOXG1_231201_motif_heatmap_Homer.pdf")
jaspar
#23.12.1
#/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/alignment/clusterbed/cluster_motif_res_Homer
from gimmemotifs.maelstrom import MaelstromResult
import matplotlib.pyplot as plt
mr = MaelstromResult("/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/alignment/clusterbed/cluster_motif_res_Homer")
mr.plot_heatmap(threshold=1,figsize=(5,10))
plt.savefig("/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/alignment/clusterbed/cluster_motif_res_Homer/FOXG1_2308_motif_heatmap_Homer.pdf")
library("ChIPseeker")
library("org.Mm.eg.db")
library("GenomicFeatures")
library("clusterProfiler")
library("ggplot2")
library("ggimage")
library("UpSetR")
setwd("E:/硕士/bioinformatics/Foxg1 CUT&TAG/重测序/cluster")
txdb <- makeTxDbFromGFF("E:/硕士/bioinformatics/GRCm39 ref/Mus_musculus.GRCm39.109.gtf")
cluster1 <- readPeakFile("c1.bed")
cluster2 <- readPeakFile("c2.bed")
cluster3 <- readPeakFile("c3.bed")
cluster4 <- readPeakFile("c4.bed")
cluster5 <- readPeakFile("c5.bed")
peaks <- list(cluster1=cluster1,
cluster2=cluster2,
cluster3=cluster3,
cluster4=cluster4,
cluster5=cluster5)
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000)
tagMatrixList <- lapply(peaks, getTagMatrix, windows=promoter)
tagHeatmap(tagMatrixList)
peakAnnoList <- lapply(peaks, annotatePeak,
TxDb=txdb,tssRegion=c(-3000, 3000),
verbose=FALSE)
plotAnnoBar(peakAnnoList,xlab="", ylab='Percentage(%)')
plotDistToTSS(peakAnnoList,title="Distribution of transcription factor-binding loci \n relative to TSS")
gene = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
vennplot(gene)
# Run GO enrichment analysis
ego <- compareCluster(geneCluster = gene,
fun = "enrichGO",
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE)
# Dotplot visualization
A = dotplot(ego,
includeAll=FALSE,
showCategory=10,
font.size=10,
title = "GO:BP Pathway Enrichment Analysis")
#KEGG
library(msigdbr)
msigdbr_species()
mouse_KEGG = msigdbr(species = "Mus musculus",
category = "C2",
subcategory = "CP:KEGG") %>% dplyr::select(gs_name,ensembl_gene)
compKEGG <- compareCluster(geneCluster = gene,
fun = "enricher",
TERM2GENE = mouse_KEGG,
pvalueCutoff = 0.05,
pAdjustMethod = "BH")
B = dotplot(compKEGG, showCategory = 10,
#color = "pvalue",
title = "KEGG Pathway Enrichment Analysis" ,shape = T)
C=A+B
C
pdf(file = "GO + KEGG.pdf" , width = 20,height = 15)
dev.off()
write.table(compKEGG ,file = "KEGG.xls" , sep = "\t" ,quote = F ,row.names = F )
write.table(ego ,file = "GO.xls" , sep = "\t" ,quote = F ,row.names = F )
library(transPlotR)
library(rtracklayer)
library(aplot)
library(grid)
library(ggplotify)
bdfile <- list.files(pattern = '.bed')
bdfile
bedVis(bdFile = bdfile,
chr = "7",
track.width = 0.3,
show.legend = T)
gtf <- BiocIO::import('E:/硕士/bioinformatics/GRCm39 ref/Mus_musculus.GRCm39.109.gtf',format = "gtf") %>% data.frame()
#loadBigWig
file <- list.files(pattern = '.bw')
mybw <- loadBigWig(file)
# check
head(mybw,3)
# color aes by transcript
p <-
trancriptVis(gtfFile = gtf,
gene = "Ube3a",
relTextDist = -0.5,
exonWidth = 0.5,
exonFill = 'black',
arrowCol = 'black',
textLabelSize = 3,
addNormalArrow = FALSE,
newStyleArrow = TRUE,
xAxis.info = FALSE,
textLabel = 'transcript_name',
circle = F,collapse = T) +
xlab('')
# track + gene + peak
Ube3a <-
trackVis(bWData = mybw,
gtf.file = gtf,
gene.name = "Ube3a",
extend.up = 3000,
extend.dn = 3000,
xAxis.info = F,
theme = "bw",
yAxis.info = F,new.yaxis = T,new.label = T,
pos.ratio = c(0.06,0.8),
color = jjAnno::useMyCol(platte = "stallion",n = 3))
# peak
bd1 <- bedVis(bdFile = bdfile,
chr = "7",
track.width = 0.5,
show.legend = F,
fill=jjAnno::useMyCol(platte = "stallion",n = 2))
# combine
c1 <- Ube3a %>% insert_bottom(p,height = 0.30) %>%
insert_bottom(bd1,height = 0.10)
c1
c1 <- as.grob(c1)
ggsave("Ube3a.pdf")
# color aes by transcript
p2 <-
trancriptVis(gtfFile = gtf,
gene = "Ube3c",
relTextDist = -0.5,
exonWidth = 0.5,
exonFill = 'black',
arrowCol = 'black',
textLabelSize = 3,
addNormalArrow = FALSE,
newStyleArrow = TRUE,
xAxis.info = FALSE,
textLabel = 'transcript_name',
circle = F,collapse = T) +
xlab('')
# track + gene + peak
Ube3c <-
trackVis(bWData = mybw,
gtf.file = gtf,
gene.name = "Ube3c",
extend.up = 3000,
extend.dn = 3000,
xAxis.info = F,
theme = "bw",
yAxis.info = F,new.yaxis = T,new.label = T,
pos.ratio = c(0.06,0.8),
color = jjAnno::useMyCol(platte = "stallion",n = 3))
# peak
bd2 <- bedVis(bdFile = bdfile,
chr = "5",
track.width = 0.3,
show.legend = F,
fill=jjAnno::useMyCol(platte = "stallion",n = 2))
# combine
c2 <- Ube3c %>% insert_bottom(p2,height = 0.3) %>%
insert_bottom(bd2,height = 0.1)
c2
c2 <- as.grob(c2)
ggsave("Ube3c.pdf")
# combine
library(gridExtra)
grid.arrange(c1, c2, ncol=2)
ggsave("Ube3c.pdf")
library("ChIPseeker")
library("org.Mm.eg.db")
library("GenomicFeatures")
library("clusterProfiler")
library("ggplot2")
library("ggimage")
library("UpSetR")
library("ggGenshin")
library("GeneOverlap")
library("ggVennDiagram")
library("VennDiagram")
txdb <- makeTxDbFromGFF("E:/硕士/bioinformatics/GRCm39 ref/Mus_musculus.GRCm39.109.gtf")
cluster1 <- readPeakFile("c1.bed")
cluster2 <- readPeakFile("c2.bed")
cluster3 <- readPeakFile("c3.bed")
cluster4 <- readPeakFile("c4.bed")
cluster5 <- readPeakFile("c5.bed")
peaks <- list(cluster1=cluster1,
cluster2=cluster2,
cluster3=cluster3,
cluster4=cluster4,
cluster5=cluster5)
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000)
tagMatrixList <- lapply(peaks, getTagMatrix, windows=promoter)
tagHeatmap(tagMatrixList)
peakAnnoList <- lapply(peaks, annotatePeak,
TxDb=txdb,tssRegion=c(-3000, 3000),
verbose=FALSE)
plotAnnoBar(peakAnnoList,xlab="", ylab='Percentage(%)')
plotDistToTSS(peakAnnoList,title="Distribution of transcription factor-binding loci \n relative to TSS")
gene = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
peak <- readPeakFile("FOXG1_cluster_regions.xls")
peakanno <- annotatePeak(peak,
TxDb = txdb,
tssRegion=c(-3000, 3000),
verbose=FALSE)
write.csv(peakanno,"total_peak_anno.csv")
peakanno <- as.data.frame(peakanno)
OE_vs_HD <- read.table("OE_vs_HD_result_data_anno.xls",sep = "\t",header = T)
merge <- merge(peakanno,OE_vs_HD,by = "geneId")
# filter by LFC and padj
merge1 <- merge[(abs(merge$log2FoldChange)< 7),]
merge_filt<- merge[(abs(merge$log2FoldChange)>= 0.5 & abs(merge$log2FoldChange)< 7 &
(merge$pvalue<= 0.05)),]
merge_filt<- as.data.frame(merge_filt)
merge_filt <- merge_filt[!is.na(merge_filt$cluster_1), ]
# Violin plot of DEG distribution in each cluster LFC.cutoff=0.5 for dots
#my_palette <- brewer.pal(name="Blues",n=8)[3:8]
ggGenshin::keys()
my_palette <- pal_xiao(alpha = 1)(5)
p_Foxg1 <- ggplot(merge1,
aes(x=cluster_1, y=log2FoldChange, fill=cluster_1, color= cluster_1, alpha=0.5, font=12))+
scale_color_manual(values = my_palette, aesthetics = "fill")+
scale_color_manual(values = my_palette, aesthetics = "colour")+
geom_violin()+
labs(x="cluster", y = "Log2FC")+ theme_light()+
stat_summary(fun=median, geom="point", size=2, color="black") +
theme(axis.text.x = element_text(size=12,angle =60),
axis.text.y = element_text(size=12),
axis.title = element_text(size=12))
violin_p_Foxg1<- p_Foxg1 + geom_jitter( data= merge_filt,
shape=16,
size=2.5,alpha = 0.5,
position=position_jitter(width=0.2, height= 0.2))+
stat_summary(fun=median, geom="point", size=2, color="black")
violin_p_Foxg1
ggsave("violin_plot_OE_vs_HD.pdf")
##KO vs WT
KO_vs_WT <- read.table("KO_vs_WT_result_data_anno.xls",sep = "\t",header = T)
mergeKO <- merge(peakanno,KO_vs_WT,by = "geneId")
# filter by LFC and padj
mergeKO1 <- mergeKO[(abs(mergeKO$log2FoldChange)< 7),]
mergeKO_filt<- mergeKO[(abs(mergeKO$log2FoldChange)>= 0.5 & abs(mergeKO$log2FoldChange)< 7 &
(mergeKO$padj<= 0.05)),]
mergeKO_filt<- as.data.frame(mergeKO_filt)
mergeKO_filt <- mergeKO_filt[!is.na(mergeKO_filt$cluster_1), ]
# Violin plot of DEG distribution in each cluster LFC.cutoff=0.5 for dots
#my_palette <- brewer.pal(name="Blues",n=8)[3:8]
ggGenshin::keys()
my_palette <- pal_nahida(alpha = 1)(5)
p_Foxg1_2 <- ggplot(mergeKO1,
aes(x=cluster_1, y=log2FoldChange, fill=cluster_1, color= cluster_1, alpha=0.5, font=12))+
scale_color_manual(values = my_palette, aesthetics = "fill")+
scale_color_manual(values = my_palette, aesthetics = "colour")+
geom_violin()+
labs(x="cluster", y = "Log2FC")+ theme_light()+
stat_summary(fun=median, geom="point", size=2, color="black") +
theme(axis.text.x = element_text(size=12,angle =60),
axis.text.y = element_text(size=12),
axis.title = element_text(size=12))
p_Foxg1_2
violin_p_Foxg1_2<- p_Foxg1_2 + geom_jitter( data= mergeKO_filt,
shape=16,
size=2.5,alpha = 0.5,
position=position_jitter(width=0.2, height= 0.2))+
stat_summary(fun=median, geom="point", size=2, color="black")
violin_p_Foxg1_2
###correlation
# read in K-means clustered peaks files (from the heatmap above)
cluster1 <- peakanno[(peakanno$cluster_1 == "cluster_1"),]
cluster2 <- peakanno[(peakanno$cluster_1 == "cluster_2"),]
cluster3 <- peakanno[(peakanno$cluster_1 == "cluster_3"),]
cluster4 <- peakanno[(peakanno$cluster_1 == "cluster_4"),]
cluster5 <- peakanno[(peakanno$cluster_1 == "cluster_5"),]
# h3k27ac cluster list with annotations (gain, loss, static)
foxg1_list<- list(static_foxg1= cluster1$geneId,
loss1_foxg1= cluster2$geneId,
loss2_foxg1= cluster3$geneId,
gain_foxg1= cluster4$geneId,
static_no_foxg1= cluster5$geneId)
# Upload DEG table
OE_vs_HD
KO_vs_WT
# filter according to increased and decreased gene expression
increased_DEG_OE<-OE_vs_HD[(OE_vs_HD$log2FoldChange>=0.5 &
OE_vs_HD$pvalue<=0.05),]
decreased_DEG_OE<-OE_vs_HD[(OE_vs_HD$log2FoldChange<= -0.5 &
OE_vs_HD$pvalue<=0.05),]
static_DEG_OE<-OE_vs_HD[(abs(OE_vs_HD$log2FoldChange)<=0.5 & OE_vs_HD$pvalue>0.05),]
# combine filtered DEGs in a list
DEG_list_OE<- list(increased_DEG= increased_DEG_OE$geneId,
decreased_DEG=decreased_DEG_OE$geneId,
static=static_DEG_OE$geneId)
# Prepare Geneoverlap matrix for enrichment test- clustered H3K27ac-DEG overlap
GO_matrix_foxg1OE<-newGOM(foxg1_list, DEG_list_OE, genome.size = 24500)
# oddsratio heatmap for strength of association between lists
heatmap_OE<- drawHeatmap(GO_matrix_foxg1OE,
what = c("odds.ratio"),
adj.p=T,
cutoff=0.05,
ncolused = 5,
grid.col = "Blues",
note.col = "Black")
##KO
# filter according to increased and decreased gene expression
increased_DEG_KO<-KO_vs_WT[(KO_vs_WT$log2FoldChange>=0.5 &
KO_vs_WT$padj<=0.05),]
decreased_DEG_KO<-KO_vs_WT[(KO_vs_WT$log2FoldChange<= -0.5 &
KO_vs_WT$padj<=0.05),]
static_DEG_KO<-KO_vs_WT[(abs(KO_vs_WT$log2FoldChange)<=0.5 & KO_vs_WT$padj>0.05),]
# combine filtered DEGs in a list
DEG_list_KO<- list(increased_DEG= increased_DEG_KO$geneId,
decreased_DEG=decreased_DEG_KO$geneId,
static=static_DEG_KO$geneId)
# Prepare Geneoverlap matrix for enrichment test- clustered H3K27ac-DEG overlap
GO_matrix_foxg1KO<-newGOM(foxg1_list, DEG_list_KO, genome.size = 24500)
GO_matrix_foxg1KO
#> GO_matrix_foxg1KO
#A <5 x 3> GeneOverlapMatrix object
#Geneset A sizes:
# static_foxg1 loss1_foxg1 loss2_foxg1 gain_foxg1 static_no_foxg1
#1790 9775 11413 29431 30564
#Geneset B sizes:
# increased_DEG decreased_DEG static
#1239 1655 12491
# oddsratio heatmap for strength of association between lists
heatmap_KO<- drawHeatmap(GO_matrix_foxg1KO,
what = c("odds.ratio"),
log.scale = F,
adj.p=T,
cutoff=0.05,
ncolused = 5,
grid.col = "Blues",
note.col = "Black")
Foxg1_KD_DEGs<-read.table("DE_genes_shrinked_apeglm_DIV11.tabular",
sep="\t", header = TRUE,)
Foxg1_KD_DEGs_df<- as.data.frame(Foxg1_KD_DEGs)
Foxg1_KD_DEGs_df$log2FoldChange<-as.numeric(gsub(",", ".", Foxg1_KD_DEGs_df$log2FoldChange))
nrow(Foxg1_KD_DEGs_df)
nrow(Foxg1_KD_DEGs)
OE vs HD
KO vs WT
>